# finalized features
# preview the engineered feature set used for clustering below
df.head()
| beat | cited_person_age | charge_description | month | day_of_week | hour_of_day | |
|---|---|---|---|---|---|---|
| 0 | 12.0 | 19.0 | Speed Greater Than R&P or Posted | 7 | 2 | 7 |
| 1 | 12.0 | 36.0 | Speed Greater Than R&P or Posted | 7 | 2 | 8 |
| 2 | 7.0 | 61.0 | Speed Greater Than R&P or Posted | 7 | 2 | 8 |
| 3 | 7.0 | 59.0 | Speed Greater Than R&P or Posted | 7 | 3 | 8 |
| 4 | 19.0 | 23.0 | Speed Greater Than R&P or Posted | 7 | 5 | 22 |
This clustering method works by forming clusters based on the concentration and closeness of datapoints. I believe this clustering method will be helpful since:
# find best DBSCAN parameters
# grid-search eps and min_samples, recording the cluster count and
# silhouette score for every combination
cluster_count = []
sil_score = []
eps_ = []
min_samples_ = []
for eps in [.1, .5, 1]:
for min_samples in [5, 10, 15]:
# Define the DBSCAN model for this parameter combination
dbscan_cluster = DBSCAN(eps=eps, min_samples=min_samples)
# Fit model
clusters = dbscan_cluster.fit_predict(X_std)
# capture cluster count (label -1 is DBSCAN's noise bucket, not a cluster)
cluster_count.append(len(set(clusters)) - (1 if -1 in clusters else 0))
# capture model fit
# (silhouette score is undefined when everything lands in one label,
# hence the nunique guard; NaN marks those runs)
if pd.Series(clusters).nunique() > 1:
sil_score.append(round(metrics.silhouette_score(X_std, clusters, metric='euclidean'), 3))
else:
sil_score.append(np.nan)
# record parameter values
eps_.append(eps)
min_samples_.append(min_samples)
# tabulate param results from above
# sort by silhouette score & cluster count
# (zip already yields tuples, so the original's double list() wrapper
# `list(list(zip(...)))` was redundant — one list() is enough)
cluster_scores = pd.DataFrame(list(zip(sil_score, cluster_count, eps_, min_samples_)),
                              columns=['sil_score', 'cluster_count', 'eps', 'min_samples']) \
    .sort_values(by=['sil_score', 'cluster_count'], ascending=False)
cluster_scores
| sil_score | cluster_count | eps | min_samples | |
|---|---|---|---|---|
| 6 | 0.197 | 499 | 1.0 | 5 |
| 7 | 0.081 | 233 | 1.0 | 10 |
| 8 | -0.004 | 143 | 1.0 | 15 |
| 0 | -0.249 | 6 | 0.1 | 5 |
| 5 | -0.385 | 50 | 0.5 | 15 |
| 4 | -0.446 | 131 | 0.5 | 10 |
| 3 | -0.462 | 565 | 0.5 | 5 |
| 1 | NaN | 0 | 0.1 | 10 |
| 2 | NaN | 0 | 0.1 | 15 |
# Two stacked bar charts over the parameter grid:
# top = silhouette score, bottom = resulting cluster count.
fig, (ax_top, ax_bottom) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(10, 6))
cluster_scores.loc[:, ['sil_score']].plot(kind='bar', color='green', legend=False, ax=ax_top)
ax_top.set_title('SILHOUETTE SCORE', fontsize=15)
cluster_scores.loc[:, ['cluster_count']].plot(kind='bar', legend=False, ax=ax_bottom)
ax_bottom.set_title('CLUSTER COUNT', fontsize=15)
plt.xticks(rotation=1)
# Fit DBSCAN with the best parameter combination found above
best_dbscan = DBSCAN(eps=1, min_samples=5)
clusters = best_dbscan.fit_predict(X_std)
# Pie chart of how citations distribute across the resulting clusters
plt.figure(figsize=(6, 6))
cluster_sizes = pd.Series(clusters).value_counts()
cluster_sizes.plot(kind='pie', title='Cluster Distribution')
plt.ylabel('')
With such a large number of clusters I focused on analyzing characteristics within the top 50.
# cluster sizes, largest first (-1 is DBSCAN's noise label)
pd.Series(clusters).value_counts()
-1 6517
2 2024
27 1193
4 991
1 930
...
496 3
229 3
476 3
442 2
330 2
Length: 500, dtype: int64
# concat cluster numbers to dataset
df_clusters = pd.concat([df.reset_index(drop=True), pd.Series(clusters).rename('cluster')], axis=1)
# subset dataset based on top 50 clusters by size.
# The noise label (-1) is dropped explicitly; the original sliced
# value_counts()[1:51], which silently assumed -1 is always the single
# largest group (true for this fit, but fragile).
cluster_sizes = pd.Series(clusters).value_counts()
top50 = cluster_sizes.drop(index=-1, errors='ignore').head(50).index
df_clusters = df_clusters[df_clusters.cluster.isin(top50)]
# how many citations are included in the
# top 50 clusters
df_clusters.shape
(21002, 7)
Each of these clusters is made up of a single value for each categorical variable (one citation type and one location).
Both of these clusters represent 654 young people who receive DUI citations early in the day on the weekend within beats 5 & 6.
# how many people are included within these clusters
# how many people are included within these clusters
df_clusters[df_clusters.cluster.isin([55, 79])].shape
(654, 7)
def _plot_cluster_diff(cluster_attribute, aggregator, cluster_ids):
    """Bar-plot the per-(cluster, aggregator) mean of `cluster_attribute`
    next to the full-dataset mean for the same aggregator groups.

    cluster_attribute: numeric column to average (e.g. 'cited_person_age')
    aggregator: categorical column to group by (e.g. 'charge_description')
    cluster_ids: cluster labels to include in the comparison
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    df_cluster_sub = df_clusters[df_clusters.cluster.isin(cluster_ids)]
    # per-cluster means of the attribute within each aggregator group
    cluster_means = df_cluster_sub.groupby(['cluster', aggregator]).agg(['mean']) \
        .loc[:, [cluster_attribute]].reset_index()
    # full-data means of the same attribute, for comparison
    overall_means = df.groupby([aggregator]).agg(['mean']) \
        .loc[:, [cluster_attribute]].reset_index()
    cluster_means.merge(overall_means,
                        how='left',
                        on=aggregator,
                        suffixes=('_cluster', '_all_data')) \
        .set_index(['cluster', aggregator]) \
        .droplevel(level=1, axis=1) \
        .plot(ax=ax, kind='bar', alpha=.7)
    plt.title('CLUSTER DIFFERENCES - ' + cluster_attribute.upper(), fontsize=14)
    plt.ylabel(cluster_attribute)
    plt.legend(loc='lower right')
    plt.xticks(rotation=30, horizontalalignment="right")

# same attribute/aggregator pairs the original four copy-pasted cells used
for _attr, _agg in [('cited_person_age', 'charge_description'),
                    ('cited_person_age', 'beat'),
                    ('day_of_week', 'charge_description'),
                    ('hour_of_day', 'charge_description')]:
    _plot_cluster_diff(_attr, _agg, [55, 79])
This cluster represents the latest citations happening in the day. With little preference for age, this cluster is made up of 213 people who received DUI citations exclusively within beat #6 very late in the evening.
# size of cluster 58
df_clusters[(df_clusters.cluster == 58)].shape
(213, 7)
def _plot_cluster_diff(cluster_attribute, aggregator, cluster_ids):
    """Bar-plot the per-(cluster, aggregator) mean of `cluster_attribute`
    next to the full-dataset mean for the same aggregator groups.

    cluster_attribute: numeric column to average (e.g. 'cited_person_age')
    aggregator: categorical column to group by (e.g. 'charge_description')
    cluster_ids: cluster labels to include in the comparison
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    df_cluster_sub = df_clusters[df_clusters.cluster.isin(cluster_ids)]
    # per-cluster means of the attribute within each aggregator group
    cluster_means = df_cluster_sub.groupby(['cluster', aggregator]).agg(['mean']) \
        .loc[:, [cluster_attribute]].reset_index()
    # full-data means of the same attribute, for comparison
    overall_means = df.groupby([aggregator]).agg(['mean']) \
        .loc[:, [cluster_attribute]].reset_index()
    cluster_means.merge(overall_means,
                        how='left',
                        on=aggregator,
                        suffixes=('_cluster', '_all_data')) \
        .set_index(['cluster', aggregator]) \
        .droplevel(level=1, axis=1) \
        .plot(ax=ax, kind='bar', alpha=.7)
    plt.title('CLUSTER DIFFERENCES - ' + cluster_attribute.upper(), fontsize=14)
    plt.ylabel(cluster_attribute)
    plt.legend(loc='lower right')
    plt.xticks(rotation=30, horizontalalignment="right")

# same attribute/aggregator pairs the original four copy-pasted cells used
for _attr, _agg in [('cited_person_age', 'charge_description'),
                    ('cited_person_age', 'beat'),
                    ('day_of_week', 'charge_description'),
                    ('hour_of_day', 'charge_description')]:
    _plot_cluster_diff(_attr, _agg, [58])
These clusters represent 509 mostly older people who received citations at a relatively early hour of the day within beat locations 12 & 13.
# how many people are included within these clusters
df_clusters[df_clusters.cluster.isin([101, 98])].shape
(509, 7)
def _plot_cluster_diff(cluster_attribute, aggregator, cluster_ids):
    """Bar-plot the per-(cluster, aggregator) mean of `cluster_attribute`
    next to the full-dataset mean for the same aggregator groups.

    cluster_attribute: numeric column to average (e.g. 'cited_person_age')
    aggregator: categorical column to group by (e.g. 'charge_description')
    cluster_ids: cluster labels to include in the comparison
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    df_cluster_sub = df_clusters[df_clusters.cluster.isin(cluster_ids)]
    # per-cluster means of the attribute within each aggregator group
    cluster_means = df_cluster_sub.groupby(['cluster', aggregator]).agg(['mean']) \
        .loc[:, [cluster_attribute]].reset_index()
    # full-data means of the same attribute, for comparison
    overall_means = df.groupby([aggregator]).agg(['mean']) \
        .loc[:, [cluster_attribute]].reset_index()
    cluster_means.merge(overall_means,
                        how='left',
                        on=aggregator,
                        suffixes=('_cluster', '_all_data')) \
        .set_index(['cluster', aggregator]) \
        .droplevel(level=1, axis=1) \
        .plot(ax=ax, kind='bar', alpha=.7)
    plt.title('CLUSTER DIFFERENCES - ' + cluster_attribute.upper(), fontsize=14)
    plt.ylabel(cluster_attribute)
    plt.legend(loc='lower right')
    plt.xticks(rotation=30, horizontalalignment="right")

# same attribute/aggregator pairs the original four copy-pasted cells used
for _attr, _agg in [('cited_person_age', 'charge_description'),
                    ('cited_person_age', 'beat'),
                    ('day_of_week', 'charge_description'),
                    ('hour_of_day', 'charge_description')]:
    _plot_cluster_diff(_attr, _agg, [101, 98])
This cluster represents the youngest mean age within a cluster with 360 people receiving citations for drug paraphernalia within beat 6. Most of these took place early in the day on the weekend.
# size of cluster 78
df_clusters[(df_clusters.cluster == 78)].shape
(360, 7)
def _plot_cluster_diff(cluster_attribute, aggregator, cluster_ids):
    """Bar-plot the per-(cluster, aggregator) mean of `cluster_attribute`
    next to the full-dataset mean for the same aggregator groups.

    cluster_attribute: numeric column to average (e.g. 'cited_person_age')
    aggregator: categorical column to group by (e.g. 'charge_description')
    cluster_ids: cluster labels to include in the comparison
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    df_cluster_sub = df_clusters[df_clusters.cluster.isin(cluster_ids)]
    # per-cluster means of the attribute within each aggregator group
    cluster_means = df_cluster_sub.groupby(['cluster', aggregator]).agg(['mean']) \
        .loc[:, [cluster_attribute]].reset_index()
    # full-data means of the same attribute, for comparison
    overall_means = df.groupby([aggregator]).agg(['mean']) \
        .loc[:, [cluster_attribute]].reset_index()
    cluster_means.merge(overall_means,
                        how='left',
                        on=aggregator,
                        suffixes=('_cluster', '_all_data')) \
        .set_index(['cluster', aggregator]) \
        .droplevel(level=1, axis=1) \
        .plot(ax=ax, kind='bar', alpha=.7)
    plt.title('CLUSTER DIFFERENCES - ' + cluster_attribute.upper(), fontsize=14)
    plt.ylabel(cluster_attribute)
    plt.legend(loc='lower right')
    plt.xticks(rotation=30, horizontalalignment="right")

# same attribute/aggregator pairs the original four copy-pasted cells used
for _attr, _agg in [('cited_person_age', 'charge_description'),
                    ('cited_person_age', 'beat'),
                    ('day_of_week', 'charge_description'),
                    ('hour_of_day', 'charge_description')]:
    _plot_cluster_diff(_attr, _agg, [78])
Examining the second best parameter configuration based on silhouette coefficient.
# Re-fit DBSCAN with the runner-up parameter combination
second_best_dbscan = DBSCAN(eps=1, min_samples=10)
clusters = second_best_dbscan.fit_predict(X_std)
# Pie chart of the resulting cluster-size distribution
plt.figure(figsize=(6, 6))
pd.Series(clusters).value_counts().plot(kind='pie', title='Cluster Distribution')
Exploring parameters from the next highest silhouette coefficient, a larger share of citations are categorized as not fitting into any cluster.
Clusters 1 and 37 seem to show the most interesting trends.
This cluster has the lowest age, along with the highest values for day_of_week and hour_of_day.
This cluster is made up of over 600 citations representing much younger people who receive speeding tickets in beat #6 later in the day, and later in the week.
This cluster is made up of over 600 citations representing people who receive speeding tickets in beat #10. This cluster is less defined than that of cluster 37.
Compared with the previous DBSCAN iteration, fewer citations are categorized as not fitting into a cluster.
In an effort to identify smaller clusters with characteristics different from the full citation dataset, I focused the DBSCAN model with a smaller sample size.
This cluster is made up of 46 citations representing people who receive DUI tickets in beat #6. All of these tickets took place in the early morning hours on Sunday with an average month of the year of August. Additionally, these tickets were received by people at least 10 years younger than average for this citation type.
This cluster is quite similar to cluster 17 above. It is made up of 79 citations representing younger people who received DUI tickets in beat #6. The two exceptions from cluster 17 are that the average month of the year is August, and all these citations were received in the early morning on Saturday, rather than Sunday.
This model did a good job of finding very tight clusters with highly similar characteristics. This model overwhelmingly clustered based off the categorical features (citation type, and location(beat)).
# Project the standardized features onto the first two principal
# components and color each point by its cluster label.
X_pca = PCA(n_components=2).fit_transform(X_std)
plt.figure(figsize=(15, 10))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=pd.Series(clusters), alpha=.5)
# 2-D t-SNE embedding of the standardized features
# (verbose=1 prints the progress log shown below)
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(X_std)
[t-SNE] Computing 121 nearest neighbors... [t-SNE] Indexed 38417 samples in 0.814s... [t-SNE] Computed neighbors for 38417 samples in 38.680s... [t-SNE] Computed conditional probabilities for sample 1000 / 38417 [t-SNE] Computed conditional probabilities for sample 2000 / 38417 [t-SNE] Computed conditional probabilities for sample 3000 / 38417 [t-SNE] Computed conditional probabilities for sample 4000 / 38417 [t-SNE] Computed conditional probabilities for sample 5000 / 38417 [t-SNE] Computed conditional probabilities for sample 6000 / 38417 [t-SNE] Computed conditional probabilities for sample 7000 / 38417 [t-SNE] Computed conditional probabilities for sample 8000 / 38417 [t-SNE] Computed conditional probabilities for sample 9000 / 38417 [t-SNE] Computed conditional probabilities for sample 10000 / 38417 [t-SNE] Computed conditional probabilities for sample 11000 / 38417 [t-SNE] Computed conditional probabilities for sample 12000 / 38417 [t-SNE] Computed conditional probabilities for sample 13000 / 38417 [t-SNE] Computed conditional probabilities for sample 14000 / 38417 [t-SNE] Computed conditional probabilities for sample 15000 / 38417 [t-SNE] Computed conditional probabilities for sample 16000 / 38417 [t-SNE] Computed conditional probabilities for sample 17000 / 38417 [t-SNE] Computed conditional probabilities for sample 18000 / 38417 [t-SNE] Computed conditional probabilities for sample 19000 / 38417 [t-SNE] Computed conditional probabilities for sample 20000 / 38417 [t-SNE] Computed conditional probabilities for sample 21000 / 38417 [t-SNE] Computed conditional probabilities for sample 22000 / 38417 [t-SNE] Computed conditional probabilities for sample 23000 / 38417 [t-SNE] Computed conditional probabilities for sample 24000 / 38417 [t-SNE] Computed conditional probabilities for sample 25000 / 38417 [t-SNE] Computed conditional probabilities for sample 26000 / 38417 [t-SNE] Computed conditional probabilities for sample 27000 / 38417 [t-SNE] Computed 
conditional probabilities for sample 28000 / 38417 [t-SNE] Computed conditional probabilities for sample 29000 / 38417 [t-SNE] Computed conditional probabilities for sample 30000 / 38417 [t-SNE] Computed conditional probabilities for sample 31000 / 38417 [t-SNE] Computed conditional probabilities for sample 32000 / 38417 [t-SNE] Computed conditional probabilities for sample 33000 / 38417 [t-SNE] Computed conditional probabilities for sample 34000 / 38417 [t-SNE] Computed conditional probabilities for sample 35000 / 38417 [t-SNE] Computed conditional probabilities for sample 36000 / 38417 [t-SNE] Computed conditional probabilities for sample 37000 / 38417 [t-SNE] Computed conditional probabilities for sample 38000 / 38417 [t-SNE] Computed conditional probabilities for sample 38417 / 38417 [t-SNE] Mean sigma: 0.427159 [t-SNE] KL divergence after 250 iterations with early exaggeration: 83.370750 [t-SNE] KL divergence after 300 iterations: 3.953100
# t-SNE embedding, colored by cluster label
plt.figure(figsize=(10, 10))
plt.scatter(tsne_results[:, 0], tsne_results[:, 1], alpha=.5, c=pd.Series(clusters))

# UMAP embedding of the same data, colored the same way
reducer = umap.UMAP(n_neighbors=5, min_dist=0.3, metric='correlation')
umap_results = reducer.fit_transform(X_std)
plt.figure(figsize=(10, 10))
plt.scatter(umap_results[:, 0], umap_results[:, 1], alpha=.5, c=pd.Series(clusters))
GMM Parameters to explore
# find best GMM parameters
# grid-search the component count and covariance type, recording the
# silhouette score for every combination
sil_score = []
num_components = []
cov_type = []
for n_components in range(10, 31, 5):
for covariance_type in ['full', 'tied', 'diag', 'spherical']:
# Define the Gaussian mixture model for this parameter combination
gmm_cluster = GaussianMixture(n_components=n_components, covariance_type=covariance_type)
# Fit model
clusters = gmm_cluster.fit_predict(X_std)
sil_score.append(round(metrics.silhouette_score(X_std, clusters, metric='euclidean'), 3))
# record parameter values
num_components.append(n_components)
cov_type.append(covariance_type)
# tabulate param results from above
# sort by silhouette score
# (zip already yields tuples, so the original's double list() wrapper
# was redundant; the old comment also claimed a cluster-count sort key
# that was never used)
cluster_scores = pd.DataFrame(list(zip(sil_score, num_components, cov_type)),
                              columns=['sil_score', 'num_components', 'covariance_type']) \
    .sort_values(by=['sil_score'], ascending=False)
cluster_scores
| sil_score | num_components | covariance_type | |
|---|---|---|---|
| 12 | 0.308 | 25 | full |
| 14 | 0.302 | 25 | diag |
| 8 | 0.301 | 20 | full |
| 16 | 0.286 | 30 | full |
| 15 | 0.281 | 25 | spherical |
| 18 | 0.276 | 30 | diag |
| 19 | 0.267 | 30 | spherical |
| 13 | 0.256 | 25 | tied |
| 17 | 0.254 | 30 | tied |
| 9 | 0.254 | 20 | tied |
| 10 | 0.243 | 20 | diag |
| 7 | 0.221 | 15 | spherical |
| 11 | 0.215 | 20 | spherical |
| 6 | 0.214 | 15 | diag |
| 5 | 0.210 | 15 | tied |
| 4 | 0.206 | 15 | full |
| 1 | 0.151 | 10 | tied |
| 2 | 0.148 | 10 | diag |
| 0 | 0.136 | 10 | full |
| 3 | 0.032 | 10 | spherical |
# plot silhouette score and cluster count
# (bottom panel plots num_components; for GMM every component is a
# cluster, so the CLUSTER COUNT title is accurate)
fig, (ax0, ax1) = plt.subplots(nrows=2,ncols=1, sharex=True, figsize=(10, 6))
cluster_scores.loc[:,['sil_score']] \
.plot(kind='bar', color='green', legend=False, ax=ax0)
ax0.set_title('SILHOUETTE SCORE', fontsize=15)
cluster_scores.loc[:,['num_components']] \
.plot(kind='bar', legend=False, ax=ax1);
ax1.set_title('CLUSTER COUNT', fontsize=15)
plt.xticks(rotation=1);
The silhouette score at 25 components is significantly higher than that of DBSCAN at .197.
# Fit the best GMM configuration found above
# (25 components, full covariance)
gmm_cluster = GaussianMixture(n_components=25, covariance_type='full')
clusters = gmm_cluster.fit_predict(X_std)
# Pie chart of the resulting cluster-size distribution
plt.figure(figsize=(6, 6))
pd.Series(clusters).value_counts().plot(kind='pie', title='Cluster Distribution')
plt.ylabel('')
Compared to DBSCAN, the cluster sizes under GMM are much more evenly distributed.
# join cluster assignment to df
df_clusters = pd.concat([df.reset_index(drop=True), pd.Series(clusters).rename('cluster')], axis=1)
# NOTE(review): plot_clusters_scaled is called twice with identical
# arguments, rendering the same figure twice — confirm this is intentional
plot_clusters_scaled(df_clusters)
plot_clusters_scaled(df_clusters)
This cluster is made up of tickets for speeding in school zones. It has a high diversity of beat locations. This citation also happens mostly early in the morning, and almost exclusively early in the week.
# cluster 22: compare cluster means against full-data means across
# several aggregator/attribute pairs (plot_clusters is a project helper)
plot_clusters(df, df_clusters, 'beat', 'cited_person_age', 22)
plot_clusters(df, df_clusters, 'beat', 'hour_of_day', 22)
plot_clusters(df, df_clusters, 'beat', 'day_of_week', 22)
plot_clusters(df, df_clusters, 'charge_description', 'cited_person_age', 22)
This cluster is made up of XXX (TODO: fill in the count) people who received a citation within beat 20. It contains a high number of unique citation types, with no distinctive qualities in the time of day or day of the week these citations were issued.
# cluster 6: citation-type and location profiles vs. age
plot_clusters(df, df_clusters, 'charge_description', 'cited_person_age', 6)
plot_clusters(df, df_clusters, 'beat', 'cited_person_age', 6)
This cluster has the latest average time of day and is made up 1136 people who receive citations for "Expired/No Arizona Registration". This cluster's location is exclusively within beat 11.
# size of cluster 8
df_clusters[df_clusters.cluster == 8].shape
(1136, 7)
# cluster 8: time-of-day, age, and day-of-week profiles
plot_clusters(df, df_clusters, 'charge_description', 'hour_of_day', 8)
plot_clusters(df, df_clusters, 'charge_description', 'cited_person_age', 8)
plot_clusters(df, df_clusters, 'beat', 'day_of_week', 8)
Although this model classified all citations into one of 25 clusters, the specific characteristics of each cluster were not as strong as those within the smaller DBSCAN clusters.
This model was able to cluster more strongly than DBSCAN using characteristics other than citation type and location. Whereas all the DBSCAN clusters were made up of only one citation type and one location, several GMM clusters included multiple unique citation types and locations.
# Computing the first principal components
# (2-D PCA projection of the standardized features, colored by cluster)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_std)
plt.figure(figsize=(15,10))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=pd.Series(clusters), alpha=.5);
# 2-D t-SNE embedding of the same features
# (verbose=1 prints the progress log shown below)
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(X_std)
[t-SNE] Computing 121 nearest neighbors... [t-SNE] Indexed 38417 samples in 0.664s... [t-SNE] Computed neighbors for 38417 samples in 34.447s... [t-SNE] Computed conditional probabilities for sample 1000 / 38417 [t-SNE] Computed conditional probabilities for sample 2000 / 38417 [t-SNE] Computed conditional probabilities for sample 3000 / 38417 [t-SNE] Computed conditional probabilities for sample 4000 / 38417 [t-SNE] Computed conditional probabilities for sample 5000 / 38417 [t-SNE] Computed conditional probabilities for sample 6000 / 38417 [t-SNE] Computed conditional probabilities for sample 7000 / 38417 [t-SNE] Computed conditional probabilities for sample 8000 / 38417 [t-SNE] Computed conditional probabilities for sample 9000 / 38417 [t-SNE] Computed conditional probabilities for sample 10000 / 38417 [t-SNE] Computed conditional probabilities for sample 11000 / 38417 [t-SNE] Computed conditional probabilities for sample 12000 / 38417 [t-SNE] Computed conditional probabilities for sample 13000 / 38417 [t-SNE] Computed conditional probabilities for sample 14000 / 38417 [t-SNE] Computed conditional probabilities for sample 15000 / 38417 [t-SNE] Computed conditional probabilities for sample 16000 / 38417 [t-SNE] Computed conditional probabilities for sample 17000 / 38417 [t-SNE] Computed conditional probabilities for sample 18000 / 38417 [t-SNE] Computed conditional probabilities for sample 19000 / 38417 [t-SNE] Computed conditional probabilities for sample 20000 / 38417 [t-SNE] Computed conditional probabilities for sample 21000 / 38417 [t-SNE] Computed conditional probabilities for sample 22000 / 38417 [t-SNE] Computed conditional probabilities for sample 23000 / 38417 [t-SNE] Computed conditional probabilities for sample 24000 / 38417 [t-SNE] Computed conditional probabilities for sample 25000 / 38417 [t-SNE] Computed conditional probabilities for sample 26000 / 38417 [t-SNE] Computed conditional probabilities for sample 27000 / 38417 [t-SNE] Computed 
conditional probabilities for sample 28000 / 38417 [t-SNE] Computed conditional probabilities for sample 29000 / 38417 [t-SNE] Computed conditional probabilities for sample 30000 / 38417 [t-SNE] Computed conditional probabilities for sample 31000 / 38417 [t-SNE] Computed conditional probabilities for sample 32000 / 38417 [t-SNE] Computed conditional probabilities for sample 33000 / 38417 [t-SNE] Computed conditional probabilities for sample 34000 / 38417 [t-SNE] Computed conditional probabilities for sample 35000 / 38417 [t-SNE] Computed conditional probabilities for sample 36000 / 38417 [t-SNE] Computed conditional probabilities for sample 37000 / 38417 [t-SNE] Computed conditional probabilities for sample 38000 / 38417 [t-SNE] Computed conditional probabilities for sample 38417 / 38417 [t-SNE] Mean sigma: 0.427159 [t-SNE] KL divergence after 250 iterations with early exaggeration: 83.277061 [t-SNE] KL divergence after 300 iterations: 3.945654
# t-SNE embedding, colored by cluster label
plt.figure(figsize=(10, 10))
plt.scatter(tsne_results[:, 0], tsne_results[:, 1], alpha=.5, c=pd.Series(clusters))

# UMAP embedding of the same data, colored the same way
reducer = umap.UMAP(n_neighbors=5, min_dist=0.3, metric='correlation')
umap_results = reducer.fit_transform(X_std)
plt.figure(figsize=(10, 10))
plt.scatter(umap_results[:, 0], umap_results[:, 1], alpha=.5, c=pd.Series(clusters))
Since I'm not necessarily expecting clusters of equal sizes I will use 'average' linkage method.
# plt.figure(figsize=(6, 6))
# agg_cluster = AgglomerativeClustering(linkage='average',
# affinity='cosine',
# n_clusters=13)
# clusters = agg_cluster.fit_predict(X_std)
# pd.Series(clusters).value_counts().plot(kind='pie', title='Cluster Distribution');
Running the above produced "MemoryError: unable to allocate array data." So I sampled 10000 citations below.
# select 10000 random rows
# replace=False samples each citation at most once; the original call
# used the default replace=True, so the "10000 random rows" actually
# contained duplicates
rand_index = np.random.choice(X_std.shape[0], size=10000, replace=False)
# subset the full dataset
X_std_sample = X_std[rand_index, :]
# find best parameters
# vary n_clusters over 2..26 in steps of 4, recording cluster count and
# silhouette score for each fit on the 10000-row sample
cluster_count = []
sil_score = []
cluster_param = []
for i in range(2, 27, 4):
# Defining the clustering model
agg_cluster = AgglomerativeClustering(linkage='average',
affinity='cosine',
n_clusters=i)
# Fit model
clusters = agg_cluster.fit_predict(X_std_sample)
# capture cluster count
# (the -1 check appears to be a holdover from the DBSCAN loop above;
# agglomerative labels have no noise class, so it is a no-op here)
cluster_count.append(len(set(clusters)) - (1 if -1 in clusters else 0))
# capture model fit
# (silhouette score is undefined for a single cluster, hence the guard)
if pd.Series(clusters).nunique() > 1:
sil_score.append(round(metrics.silhouette_score(X_std_sample, clusters, metric='euclidean'), 3))
else:
sil_score.append(np.nan)
# record parameter values
cluster_param.append(i)
# tabulate param results from above
# sort by silhouette score & cluster count
# FIX: the original referenced an undefined name `n_clusters_ct`
# (NameError); the list populated in the loop above is `cluster_param`.
# The redundant double list() wrapper around zip is also dropped.
cluster_scores = pd.DataFrame(list(zip(sil_score, cluster_count, cluster_param)),
                              columns=['sil_score', 'cluster_count', 'cluster_param']) \
    .sort_values(by=['sil_score', 'cluster_count'], ascending=False)
cluster_scores
| sil_score | cluster_count | cluster_param | |
|---|---|---|---|
| 2 | 0.400 | 10 | 10 |
| 3 | 0.369 | 14 | 14 |
| 4 | 0.341 | 18 | 18 |
| 5 | 0.315 | 22 | 22 |
| 6 | 0.310 | 26 | 26 |
| 1 | 0.254 | 6 | 6 |
| 0 | 0.095 | 2 | 2 |
# plot silhouette score and cluster count
# (top panel: silhouette score; bottom panel: resulting cluster count)
fig, (ax0, ax1) = plt.subplots(nrows=2,ncols=1, sharex=True, figsize=(10, 6))
cluster_scores.loc[:,['sil_score']] \
.plot(kind='bar', color='green', legend=False, ax=ax0)
ax0.set_title('SILHOUETTE SCORE', fontsize=15)
cluster_scores.loc[:,['cluster_count']] \
.plot(kind='bar', legend=False, ax=ax1);
ax1.set_title('CLUSTER COUNT', fontsize=15)
plt.xticks(rotation=1);
# create clustering model
# (best configuration from the search above: 10 clusters,
# average linkage, cosine affinity)
agg_cluster = AgglomerativeClustering(linkage='average',
affinity='cosine',
n_clusters=10)
# use model to assign cluster labels
plt.figure(figsize=(6, 6))
clusters = agg_cluster.fit_predict(X_std_sample)
pd.Series(clusters).value_counts().plot(kind='pie', title='Cluster Distribution', )
plt.ylabel('');
# cluster sizes, largest first
pd.Series(clusters).value_counts(dropna=False)
0 2592 2 2382 4 1380 5 1055 1 722 3 464 8 397 9 395 6 320 7 293 dtype: int64
# join cluster assignment to df
# iloc selects the same sampled rows that were fed to the model;
# the index is reset so the cluster labels align positionally
df_clusters = pd.concat([df.iloc[pd.Series(rand_index)].reset_index(drop=True),
pd.Series(clusters).rename('cluster')],
axis=1)
# scaled overview of every cluster (project helper)
plot_clusters_scaled(df_clusters)
This cluster, made up of 2592 citations, has the highest mean age and is made up of a number of unique citations and locations.
# size of cluster 0
df_clusters[df_clusters.cluster == 0].shape
(2592, 7)
# cluster 0 vs. full data across location, citation type, and time
plot_clusters(df, df_clusters, 'beat', 'cited_person_age', 0)
plot_clusters(df, df_clusters, 'charge_description', 'cited_person_age', 0)
plot_clusters(df, df_clusters, 'charge_description', 'day_of_week', 0)
plot_clusters(df, df_clusters, 'charge_description', 'hour_of_day', 0)
def _plot_cluster_3d(cluster_id, elev):
    """3-D scatter of age / day-of-week / hour-of-day: every sampled
    citation in grey, the given cluster highlighted in red."""
    fig = plt.figure(figsize=(6, 6))
    ax = Axes3D(fig)
    # background: the full sample, faint
    ax.scatter(df_clusters.loc[:, 'cited_person_age'],
               df_clusters.loc[:, 'day_of_week'],
               df_clusters.loc[:, 'hour_of_day'],
               c='grey', alpha=.05)
    # foreground: the cluster of interest
    sub = df_clusters[df_clusters.cluster == cluster_id]
    ax.scatter(sub.loc[:, 'cited_person_age'],
               sub.loc[:, 'day_of_week'],
               sub.loc[:, 'hour_of_day'],
               c='red', alpha=.8)
    ax.set_xlabel('Age')
    ax.set_ylabel('Day of Week')
    ax.set_zlabel('Hour of Day')
    ax.view_init(elev=elev)

_plot_cluster_3d(0, elev=5)
This cluster, made up of 2382 citations, has the lowest mean age and is made up of a number of unique citations and locations.
# size of cluster 2
df_clusters[df_clusters.cluster == 2].shape
(2382, 7)
# cluster 2 vs. full data across location, citation type, and time
plot_clusters(df, df_clusters, 'beat', 'cited_person_age', 2)
plot_clusters(df, df_clusters, 'charge_description', 'cited_person_age', 2)
plot_clusters(df, df_clusters, 'charge_description', 'day_of_week', 2)
plot_clusters(df, df_clusters, 'charge_description', 'hour_of_day', 2)
def _plot_cluster_3d(cluster_id, elev):
    """3-D scatter of age / day-of-week / hour-of-day: every sampled
    citation in grey, the given cluster highlighted in red."""
    fig = plt.figure(figsize=(6, 6))
    ax = Axes3D(fig)
    # background: the full sample, faint
    ax.scatter(df_clusters.loc[:, 'cited_person_age'],
               df_clusters.loc[:, 'day_of_week'],
               df_clusters.loc[:, 'hour_of_day'],
               c='grey', alpha=.05)
    # foreground: the cluster of interest
    sub = df_clusters[df_clusters.cluster == cluster_id]
    ax.scatter(sub.loc[:, 'cited_person_age'],
               sub.loc[:, 'day_of_week'],
               sub.loc[:, 'hour_of_day'],
               c='red', alpha=.8)
    ax.set_xlabel('Age')
    ax.set_ylabel('Day of Week')
    ax.set_zlabel('Hour of Day')
    ax.view_init(elev=elev)

_plot_cluster_3d(2, elev=5)
This cluster, made up of 464 citations, is associated with the latest mean hour of the day.
# size of cluster 3
df_clusters[df_clusters.cluster == 3].shape
(464, 7)
# cluster 3: hour-of-day profile by citation type and by beat
plot_clusters(df, df_clusters, 'charge_description', 'hour_of_day', 3)
plot_clusters(df, df_clusters, 'beat', 'hour_of_day', 3)
def _plot_cluster_3d(cluster_id, elev):
    """3-D scatter of age / day-of-week / hour-of-day: every sampled
    citation in grey, the given cluster highlighted in red."""
    fig = plt.figure(figsize=(6, 6))
    ax = Axes3D(fig)
    # background: the full sample, faint
    ax.scatter(df_clusters.loc[:, 'cited_person_age'],
               df_clusters.loc[:, 'day_of_week'],
               df_clusters.loc[:, 'hour_of_day'],
               c='grey', alpha=.05)
    # foreground: the cluster of interest
    sub = df_clusters[df_clusters.cluster == cluster_id]
    ax.scatter(sub.loc[:, 'cited_person_age'],
               sub.loc[:, 'day_of_week'],
               sub.loc[:, 'hour_of_day'],
               c='red', alpha=.8)
    ax.set_xlabel('Age')
    ax.set_ylabel('Day of Week')
    ax.set_zlabel('Hour of Day')
    ax.view_init(elev=elev)

_plot_cluster_3d(3, elev=20)
Although this model classified all citations into one of 10 clusters with a silhouette coefficient higher than either GMM or DBSCAN, the specific characteristics of each cluster were not as strong as those within the smaller DBSCAN or GMM clusters.